To run this notebook reproducibly, follow these steps:
In [ ]:
# Run configuration for this notebook. Settings left as "" here are handed to
# check_or_set together with a fallback value in a later cell.
g_timestamp = ""
g_dataset_name = "20160510_A549"
g_count_alg_name = "19mer_1mm_py"
# NOTE: the directories below are absolute paths under /Users/Birmingham;
# edit them before running the notebook on another machine.
g_fastq_counts_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160510_D00611_0278_BHK55CBCXX_A549'
g_fastq_counts_run_prefix = "19mer_1mm_py_20160615223822"
g_collapsed_counts_dir = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/processed/20160510_A549"
g_collapsed_counts_run_prefix = ""
g_combined_counts_dir = ""
g_combined_counts_run_prefix = ""
# Directory appended to sys.path in the next cell.
g_code_location = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python"
In [ ]:
import sys
# Add the project's source directory to the import search path; presumably
# this is what lets the ccbbucsd imports in later cells resolve.
sys.path.append(g_code_location)
In [ ]:
# %load -s describe_var_list /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/utilities/analysis_run_prefixes.py
def describe_var_list(input_var_name_list):
    """Return a printable "name: value" listing for the given variable names.

    Args:
        input_var_name_list: iterable of strings; each one is evaluated as a
            Python expression to obtain the value displayed next to it.

    Returns:
        One string containing a "<name>: <value>" line (newline-terminated)
        per input entry, in input order. An empty input gives "".

    Raises:
        NameError: if an entry names something that cannot be resolved.

    NOTE(review): values are obtained with eval(), so every entry is executed
    as code and names are resolved against the globals of the module in which
    this function is defined (presumably why the notebook pastes the function
    inline instead of importing it -- confirm). Only pass names written by
    the notebook author, never text from an external source.
    """
    # A generator feeds join directly, so no intermediate list is built.
    return "".join(
        "{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list
    )
In [ ]:
from ccbbucsd.utilities.analysis_run_prefixes import check_or_set, get_run_prefix, get_timestamp
# Resolve every setting that may have been left blank in the configuration cell.
# NOTE(review): check_or_set presumably returns its first argument when that is
# already set and the second argument otherwise -- confirm in analysis_run_prefixes.
g_timestamp = check_or_set(g_timestamp, get_timestamp())
# Fallback for the collapsed-counts directory is the fastq counts directory.
g_collapsed_counts_dir = check_or_set(g_collapsed_counts_dir, g_fastq_counts_dir)
# Fallback run prefix is built from the dataset name, count algorithm name and timestamp.
g_collapsed_counts_run_prefix = check_or_set(g_collapsed_counts_run_prefix,
get_run_prefix(g_dataset_name, g_count_alg_name, g_timestamp))
# The combined-counts settings fall back to the (already resolved) collapsed-counts ones.
g_combined_counts_dir = check_or_set(g_combined_counts_dir, g_collapsed_counts_dir)
g_combined_counts_run_prefix = check_or_set(g_combined_counts_run_prefix, g_collapsed_counts_run_prefix)
# Print the resolved settings so they are recorded in the notebook output.
print(describe_var_list(['g_timestamp','g_collapsed_counts_dir','g_collapsed_counts_run_prefix',
'g_combined_counts_dir', 'g_combined_counts_run_prefix']))
In [ ]:
from ccbbucsd.utilities.files_and_paths import verify_or_make_dir
# Prepare both output directories before anything is written to them.
# NOTE(review): going by its name, verify_or_make_dir creates the directory
# when it is missing -- confirm in files_and_paths.
verify_or_make_dir(g_collapsed_counts_dir)
verify_or_make_dir(g_combined_counts_dir)
In [ ]:
# %load -s get_counts_file_suffix /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/construct_counter.py
def get_counts_file_suffix():
    """Return the filename suffix that identifies counts files ("counts.txt")."""
    return "counts.txt"
In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/count_combination.py
# ccbb libraries
from ccbbucsd.utilities.analysis_run_prefixes import strip_run_prefix
from ccbbucsd.utilities.files_and_paths import build_multipart_fp, group_files, get_filepaths_by_prefix_and_suffix
# project-specific libraries
from ccbbucsd.malicrispr.count_files_and_dataframes import get_counts_df
# Module metadata carried over from count_combination.py.
__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"
def get_collapsed_counts_file_suffix():
    """Return the filename suffix given to collapsed counts files ("collapsed.txt")."""
    return "collapsed.txt"
def get_combined_counts_file_suffix():
    """Return the filename suffix given to the combined counts file ("counts_combined.txt")."""
    return "counts_combined.txt"
def group_lane_and_set_files(filepaths):
    """Group file paths that differ only in their lane/set designator.

    Delegates to group_files, handing it a pattern that matches "_L" followed
    by three digits, "_", and three more digits, together with an empty string.

    Args:
        filepaths: the file paths to group.

    Returns:
        Whatever group_files returns. NOTE(review): presumably a mapping from
        the path with the designator replaced by "" to the list of original
        paths (write_collapsed_count_files iterates it with .items()) --
        confirm in files_and_paths.
    """
    # NB: this regex assumes read designator has *already* been removed
    # and replaced with _ as done by group_read_pairs
    # Raw string: "\d" inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about; the pattern text is unchanged.
    return group_files(filepaths, r"_L\d\d\d_\d\d\d", "")
def combine_count_files(counts_fp_for_dataset, run_prefix):
    """Gather the count column of every given counts file into one table.

    The table loaded from the first file becomes the base; for each later
    file, its count column is assigned into the base under that file's count
    header (an existing column with the same header is overwritten).

    Args:
        counts_fp_for_dataset: iterable of counts file paths to load.
        run_prefix: passed unchanged to get_counts_df for every file.

    Returns:
        The combined table (presumably a pandas DataFrame, as returned by
        get_counts_df -- confirm), or None when no file paths are given.
    """
    combined_df = None
    for curr_counts_fp in counts_fp_for_dataset:
        count_header, curr_counts_df = get_counts_df(curr_counts_fp, run_prefix)
        if combined_df is None:
            # First file: reuse its table as the base (it is modified in
            # place by the column assignments below).
            combined_df = curr_counts_df
            continue
        # NOTE(review): column assignment matches rows by index, so this
        # assumes every file lists its rows in the same order -- confirm.
        combined_df[count_header] = curr_counts_df[count_header]
    return combined_df
def write_collapsed_count_files(input_dir, output_dir, curr_run_prefix, counts_run_prefix, counts_suffix,
                                counts_collapsed_file_suffix):
    """Write one collapsed counts file per sample by summing its lane/set files.

    Counts files are located in input_dir by run prefix and suffix, grouped
    with group_lane_and_set_files, and the count columns of each group are
    added together into a single column named after the sample.

    Args:
        input_dir: directory searched for the input counts files.
        output_dir: directory the collapsed files are written to.
        curr_run_prefix: first part of every output file name.
        counts_run_prefix: run prefix of the input files; also stripped from
            each group name to obtain the sample name.
        counts_suffix: suffix of the input counts files.
        counts_collapsed_file_suffix: last part of every output file name.

    Returns:
        None. Each sample's table is written as a tab-separated file without
        the index column.
    """
    counts_fps_for_dataset = get_filepaths_by_prefix_and_suffix(input_dir, counts_run_prefix, counts_suffix)
    fps_by_sample = group_lane_and_set_files(counts_fps_for_dataset)

    for curr_sample, curr_fps in fps_by_sample.items():
        stripped_sample = strip_run_prefix(curr_sample, counts_run_prefix)
        output_fp = build_multipart_fp(output_dir, [curr_run_prefix, stripped_sample, counts_collapsed_file_suffix])

        combined_df = None
        for curr_fp in curr_fps:
            count_header, curr_counts_df = get_counts_df(curr_fp, counts_run_prefix)
            if combined_df is None:
                # First file of the sample: its table is the base, with the
                # count column renamed to the sample name.
                combined_df = curr_counts_df
                combined_df.rename(columns={count_header: stripped_sample}, inplace=True)
            else:
                # Later files: add their counts onto the running total.
                # NOTE(review): the addition matches rows by index, so this
                # assumes all files of a sample share row order -- confirm.
                combined_df[stripped_sample] = combined_df[stripped_sample] + curr_counts_df[count_header]

        # One output file per sample, written once all of its files are summed.
        combined_df.to_csv(output_fp, sep="\t", index=False)
def write_combined_count_file(input_dir, output_dir, curr_run_prefix, counts_run_prefix, counts_suffix,
                              combined_suffix):
    """Combine all matching counts files into one tab-separated file.

    Args:
        input_dir: directory searched for the input counts files.
        output_dir: directory the combined file is written to.
        curr_run_prefix: first part of the output file name; also the run
            prefix handed to combine_count_files.
        counts_run_prefix: run prefix used to find the input files.
        counts_suffix: suffix used to find the input files.
        combined_suffix: last part of the output file name.

    Returns:
        None. The combined table is written without the index column.

    Raises:
        ValueError: if no input file matches, so there is nothing to combine.

    NOTE(review): the files are found with counts_run_prefix but loaded with
    curr_run_prefix, whereas write_collapsed_count_files loads its files with
    counts_run_prefix. Harmless while both prefixes are equal -- confirm
    which one is intended.
    """
    output_fp = build_multipart_fp(output_dir, [curr_run_prefix, combined_suffix])
    counts_fps_for_run = get_filepaths_by_prefix_and_suffix(input_dir, counts_run_prefix, counts_suffix)
    combined_df = combine_count_files(counts_fps_for_run, curr_run_prefix)
    if combined_df is None:
        # combine_count_files returns None when it is given no files; fail
        # with a clear message instead of an AttributeError on None.to_csv.
        raise ValueError(
            "No counts files found in '{0}' with run prefix '{1}' and suffix '{2}'".format(
                input_dir, counts_run_prefix, counts_suffix))
    combined_df.to_csv(output_fp, sep="\t", index=False)
In [ ]:
from ccbbucsd.utilities.files_and_paths import summarize_filenames_for_prefix_and_suffix
# Print a summary of the counts files in the fastq counts directory that match
# the fastq counts run prefix and the counts file suffix, i.e. the same
# directory/prefix/suffix that write_collapsed_count_files is given below.
print(summarize_filenames_for_prefix_and_suffix(g_fastq_counts_dir, g_fastq_counts_run_prefix, get_counts_file_suffix()))
In [ ]:
# Collapse the per-lane/set counts files from the fastq counts directory into
# one file per sample in the collapsed counts directory.
write_collapsed_count_files(g_fastq_counts_dir, g_collapsed_counts_dir, g_collapsed_counts_run_prefix,
g_fastq_counts_run_prefix, get_counts_file_suffix(), get_collapsed_counts_file_suffix())
In [ ]:
# Combine the collapsed per-sample files into a single counts file in the
# combined counts directory.
# NOTE(review): in write_combined_count_file's signature the third parameter
# is curr_run_prefix (used for the output file name) and the fourth is
# counts_run_prefix (used to find the input files). Here the collapsed prefix
# is passed third and the combined prefix fourth, which looks reversed. The
# two values are identical unless g_combined_counts_run_prefix was set
# explicitly, so confirm the intended order.
write_combined_count_file(g_collapsed_counts_dir, g_combined_counts_dir, g_collapsed_counts_run_prefix,
g_combined_counts_run_prefix, get_collapsed_counts_file_suffix(),
get_combined_counts_file_suffix())